{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Model selection\n",
    "\n",
    "In this notebook, we implement functionality similar to that of the example `003_evaluation_one_dataset`, but using the `model_selection` function, which simplifies the workflow by returning the oversampler and classifier combination that provides the highest score."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os.path\n",
    "\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "\n",
    "import smote_variants as sv\n",
    "\n",
    "import sklearn.datasets as datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The model_selection procedure uses the cache_path directory for caching.\n",
    "\n",
    "cache_path = os.path.join(os.path.expanduser('~'), 'smote_test')\n",
    "\n",
    "# exist_ok=True makes the call a no-op when the directory is already there, so no\n",
    "# separate existence check (which could race with a concurrent creator) is needed.\n",
    "os.makedirs(cache_path, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Specify the dataset. Datasets loaded from the imbalanced_learning package come with a 'name'\n",
    "# field, which the model selection functions use for labelling. Datasets loaded from\n",
    "# sklearn.datasets lack that field, so it has to be added manually.\n",
    "\n",
    "dataset = datasets.load_breast_cancer()\n",
    "dataset = dict(data=dataset['data'], target=dataset['target'], name='breast_cancer')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Specifying the classifiers.\n",
    "\n",
    "# Seed for the decision tree, so that repeated runs of the notebook fit the same tree.\n",
    "RANDOM_STATE = 42\n",
    "\n",
    "knn_classifier = KNeighborsClassifier()\n",
    "dt_classifier = DecisionTreeClassifier(random_state=RANDOM_STATE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2019-06-11 18:18:41,886:INFO:dataset: breast_cancer, samplings_available: True, evaluations_available: True\n",
      "2019-06-11 18:18:41,888:INFO:doing the folding\n",
      "2019-06-11 18:18:41,993:INFO:Folding reading from file folding_breast_cancer.pickle\n",
      "2019-06-11 18:18:42,598:INFO:do the samplings\n",
      "2019-06-11 18:18:42,599:INFO:create sampling objects\n",
      "2019-06-11 18:18:42,601:INFO:executing 72 sampling in parallel\n",
      "2019-06-11 18:19:15,521:INFO:do the evaluations\n",
      "2019-06-11 18:19:15,521:INFO:create classifier jobs\n",
      "2019-06-11 18:19:15,549:INFO:executing 72 evaluation jobs in parallel\n",
      "2019-06-11 18:19:16,119:INFO:concatenating the results\n",
      "2019-06-11 18:19:16,381:INFO:aggregating the results\n"
     ]
    }
   ],
   "source": [
    "# Run the model selection using 5 parallel jobs and at most 35 random but meaningful\n",
    "# parameter combinations with the oversamplers.\n",
    "\n",
    "samp_obj, cl_obj = sv.model_selection(\n",
    "    dataset=dataset,\n",
    "    samplers=sv.get_n_quickest_oversamplers(5),\n",
    "    classifiers=[knn_classifier, dt_classifier],\n",
    "    cache_path=cache_path,\n",
    "    n_jobs=5,\n",
    "    max_samp_par_comb=35,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2019-06-11 18:19:16,487:INFO:OUPS: Running sampling via ('OUPS', \"{'proportion': 0.1, 'n_jobs': 1}\")\n",
      "/home/gykovacs/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:758: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.\n",
      "  \"of iterations.\", ConvergenceWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
       "           metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n",
       "           weights='uniform')"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Oversample the dataset with the oversampler returned by the model selection, then\n",
    "# train the returned classifier on the oversampled data.\n",
    "\n",
    "X_samp, y_samp = samp_obj.sample(dataset['data'], dataset['target'])\n",
    "cl_obj.fit(X_samp, y_samp)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
